This project notebook is about customer clustering for an ad marketing campaign. I used different machine learning techniques such as the k-means model, Principal Component Analysis (PCA), and autoencoders to perform clustering and dimensionality reduction.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
#import cv2
import plotly.express as px
import tensorflow as tf
from tensorflow.python.keras import Sequential
from tensorflow.keras import layers, optimizers
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, LearningRateScheduler
from IPython.display import display
from tensorflow.keras import backend as K
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.offline as py
import plotly.graph_objects as go
#from google.colab import files #library to upload files to colab notebook
%matplotlib inline
# Load the dataset; 'unicode_escape' copes with non-UTF-8 bytes in the CSV.
sales_df=pd.read_csv('sales_data_sample.csv', encoding='unicode_escape')
sales_df
# Column dtypes as parsed by read_csv (ORDERDATE arrives as object/str).
sales_df.dtypes
sales_df['ORDERDATE']
#convert ORDERDATE into datetime pandas format
sales_df['ORDERDATE']=pd.to_datetime(sales_df['ORDERDATE'])
#Count of null elements in 'ADDRESSLINE2'
pd.isna(sales_df['ADDRESSLINE2']).sum()
Information concerning the types and number of non-null elements for each variable
# Per-column dtype and non-null count summary.
sales_df.info()
Number of non-null elements for each variable in the dataset
# Number of missing values in each column.
sales_df.isna().sum() #we can also use 'isnull().sum()'
Since there are lots of null values in ADDRESSLINE2, STATE, POSTALCODE and TERRITORY, we decide to drop them. We have several options for taking geographical information into account: we can keep COUNTRY, CITY, or both; we choose to keep COUNTRY. In addition, variables such as customer names are dropped since they are not relevant here.
# Columns dominated by nulls or carrying no analytic value (IDs, names,
# contact details); COUNTRY is kept as the geographical signal.
cols_to_drop = ['ADDRESSLINE1', 'ADDRESSLINE2', 'POSTALCODE', 'TERRITORY', 'PHONE',
                'STATE', 'CONTACTFIRSTNAME', 'CONTACTLASTNAME', 'CUSTOMERNAME', 'ORDERNUMBER']
sales_df = sales_df.drop(columns=cols_to_drop)
sales_df.head()
sales_df.isnull().sum()  # remaining missing values per column
# How many unique values each column has (country, product code, product line, ...)
sales_df.nunique()
sales_df['COUNTRY'].value_counts().index  # customers' countries
sales_df['COUNTRY'].value_counts()  # number of customers in each country
def barplot_visualization(x):
    """Show an interactive bar chart of value counts for column *x* of sales_df.

    Parameters
    ----------
    x : str
        Name of a column in the module-level ``sales_df``; one bar is drawn
        per distinct value, sized and coloured by its frequency.
    """
    # Compute value_counts once instead of three times, and drop the
    # unused matplotlib Figure the original created — px.bar builds
    # its own figure object.
    counts = sales_df[x].value_counts()
    fig = px.bar(x=counts.index, y=counts, color=counts.index, height=600)
    fig.show(renderer="notebook")
We call the barplot_visualization function and pass it 'COUNTRY' as an argument to visualise the number of items per country.
barplot_visualization('COUNTRY') # number of items per country
barplot_visualization('CITY') # Visualisation of number of customer per city
sales_df['STATUS'].nunique() # number of unique values for STATUS
barplot_visualization('STATUS')
# Drop the STATUS variable — presumably near-constant per the bar plot
# above, so it adds little clustering signal (verify from the plot).
sales_df.drop(columns=['STATUS'], inplace=True)
sales_df.head()
barplot_visualization('PRODUCTLINE')
def dummies(x, df=None):
    """One-hot encode column *x*, returning the frame with *x* replaced by dummies.

    Parameters
    ----------
    x : str
        Name of the categorical column to encode.
    df : pandas.DataFrame, optional
        Frame to operate on. Defaults to the module-level ``sales_df`` so the
        original call sites (``sales_df = dummies('COUNTRY')``) are unchanged.

    Returns
    -------
    pandas.DataFrame
        *df* without column *x*, concatenated with its one-hot columns.

    Notes
    -----
    The input frame is mutated in place (column *x* is dropped) before the
    concatenated result is returned, matching the original behavior.
    """
    if df is None:
        df = sales_df  # backward-compatible: original relied on the global
    dummy = pd.get_dummies(df[x])
    df.drop(columns=x, inplace=True)
    return pd.concat([df, dummy], axis=1)
# Replace COUNTRY with its one-hot columns.
sales_df=dummies('COUNTRY')
sales_df.head()
Obtaining dummies for PRODUCTLINE and DEALSIZE
#Obtain dummies for 'PRODUCTLINE' and 'DEALSIZE' as well
sales_df=dummies('PRODUCTLINE')
sales_df=dummies('DEALSIZE')
Since the number of unique product code is 109, if we add one-hot variables, there would be additional 109 columns, we can avoid that by doing categorical encoding. It's important to avoid curse of dimensionality.
# 109 distinct product codes: use a single integer categorical code
# instead of 109 one-hot columns to avoid the curse of dimensionality.
sales_df['PRODUCTCODE']=pd.Categorical(sales_df['PRODUCTCODE']).codes
# grouping data by order date (summing all numeric columns per day)
sales_df_group=sales_df.groupby(by="ORDERDATE").sum()
sales_df_group
#using a line plot of total sales over time
# NOTE(review): this plt.Figure is unused — px.line creates its own figure.
fig=plt.Figure(figsize=(12,6))
fig=px.line(x=sales_df_group.index,y=sales_df_group['SALES'],height=600)
fig.show() # we observe peak of the sales mainly in november
# correlation matrix between the first 10 (numeric) variables
plt.figure(figsize=(20,20))
corr_matrix=sales_df.iloc[:,:10].corr()
sns.heatmap(corr_matrix, annot=True, cbar=False)
It looks like the quarter ID and the month ID are highly correlated, so let's drop 'QTR_ID'. We can also observe a strong correlation between MSRP and PRICEEACH, which is expected because MSRP is a standardized price. PRICEEACH and SALES are highly correlated too: when PRICEEACH goes up, SALES goes up as well, since SALES is simply PRICEEACH multiplied by QUANTITY.
# QTR_ID is redundant with MONTH_ID (highly correlated above), so drop it.
sales_df.drop("QTR_ID", inplace=True, axis=1)
sales_df.shape
Let's plot distplots: Distplot shows the (1) histogram, (2) kde plot and (3) rug plot.
import plotly.figure_factory as ff
plt.figure(figsize=(10,10))
# Distplot (histogram + KDE + rug) for each of the first 8 columns,
# skipping the line-number and date columns which aren't numeric features.
for i in range(8):
    if sales_df.columns[i] !='ORDERLINENUMBER' and sales_df.columns[i] !='ORDERDATE':
        fig=ff.create_distplot([sales_df[sales_df.columns[i]].apply(lambda x: float(x))], ['distplot'])
        fig.update_layout(title_text=sales_df.columns[i])
        fig.show()
# ORDERDATE has served its purpose (daily grouping above); drop it so the
# remaining frame is fully numeric.
sales_df.drop('ORDERDATE', axis=1, inplace=True)
Let's Visualize the relationship between variables using pairplots.
%matplotlib inline
plt.figure(figsize=(15,15))
# Pairwise scatter matrix of the first 8 columns, coloured by month.
fig=px.scatter_matrix(sales_df, dimensions=sales_df.columns[:8], color='MONTH_ID')
fig.update_layout(
    title='Sales Data',
    width=1100,
    height=1100,
)
fig.show()
After performing this exploratory data analysis, we move on to clustering our data. First, we use the elbow method to determine the optimal number of clusters (a minimum of three groups).
import copy
# Work on a shallow copy so the exploratory frame stays available.
sales_df_copy=copy.copy(sales_df)
sales_df_copy.drop("CITY", axis=1, inplace=True) # we drop the variable 'CITY'
# Standardize features (zero mean, unit variance) — K-means is
# distance-based, so unscaled features would dominate.
scaler=StandardScaler()
sales_df_copy_scaled=scaler.fit_transform(sales_df_copy)
sales_df_copy_scaled.shape
Function to implement the elbow method
def elbow(k, X):
    """Plot K-means inertia against cluster counts 1..k-1 (elbow method).

    Parameters
    ----------
    k : int
        Exclusive upper bound on the number of clusters to try.
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    """
    scores = []
    range_values = range(1, k)
    for i in range_values:
        kmeans = KMeans(n_clusters=i)
        kmeans.fit(X)
        # inertia_ is the sum of squared distances of samples to their
        # nearest cluster center.
        scores.append(kmeans.inertia_)
    # Bug fix: the original plotted `scores` against its 0-based index, so
    # cluster count i appeared at x-position i-1. Plot against the actual
    # cluster counts so the elbow is read off at the right k.
    plt.plot(range_values, scores, 'bx-')
    plt.title('Finding right number of clusters')
    plt.xlabel('clusters')
    plt.ylabel('scores')
    plt.show()
#calling the above function to determine the optimal number of clusters k
elbow(12,sales_df_copy_scaled)
# Cluster the data using kmeans, k=5 (read off the elbow plot above)
kmeans=KMeans(5)
kmeans.fit(sales_df_copy_scaled)
labels=kmeans.labels_ #cluster label assigned to each sample
labels
kmeans.cluster_centers_.shape  # (5 clusters, n_features)
#Converting clusters centers into a dataframe
# NOTE(review): wrapping the Index in a list ([...]) makes pandas treat it
# as a list of column arrays; a plain sales_df.columns.drop('CITY') would do.
cluster_centers=pd.DataFrame(data=kmeans.cluster_centers_, columns=[sales_df.columns.drop('CITY')])
cluster_centers
# In order to understand what these numbers mean, let's perform inverse
# transformation back to the original (unscaled) feature units.
cluster_centers = scaler.inverse_transform(cluster_centers)
cluster_centers = pd.DataFrame(data = cluster_centers, columns = [sales_df.columns.drop('CITY')])
cluster_centers
Cluster 4 (Highest) - This group represents customers who buy items in high quantity centered around ~47, they buy items in all price range leaning towards high price items of ~99. They also correspond to the highest total sales around ~8293 and they are active throughout the year. They are the highest buyers of products with high MSRP ~158.
Cluster 3 - This group represents customers who buy items in varying quantity ~37, they tend to buy high price items ~95. Their sales is bit better average ~4442, they buy products with second highest MSRP of ~117.
Cluster 0 (lowest) - This group represents customers who buy items in low quantity ~30. They tend to buy low price items ~68. Their sales ~2030 is lower than other clusters and they are extremely active around holiday season. They buy products with low MSRP ~77.
#Adding the cluster labels on my data (one label per corresponding sample)
# NOTE(review): this concat relies on both frames sharing the default
# RangeIndex — verify sales_df_copy's index was never filtered/reset.
sales_df_cluster=pd.concat([sales_df_copy, pd.DataFrame({'cluster':labels})], axis=1)
sales_df_cluster
A function to plot the histogram for each feature based on clusters
def hist_by_cluster(name_var, data, k):
    '''Draw, for every feature in name_var, one row of k histograms —
    one subplot per cluster label 0..k-1 found in data['cluster'].'''
    for feature in name_var:
        plt.figure(figsize=(30, 6))
        for cluster_id in range(k):
            plt.subplot(1, k, cluster_id + 1)
            members = data[data['cluster'] == cluster_id]
            members[feature].hist()
            plt.title('{} \nCluster - {}'.format(feature, cluster_id))
        plt.show()
#Calling the above function for the 5-cluster solution
hist_by_cluster(sales_df_cluster.columns[:8],sales_df_cluster,5)
#reduction of original data to 3 dimensions using PCA (for visualisation)
pca=PCA(n_components=3)
principal_comp=pca.fit_transform(sales_df_copy_scaled)
principal_comp
#Converting the PCA axes into a dataframe
pca_df=pd.DataFrame(data=principal_comp, columns=['pca1','pca2','pca3'])
pca_df.head()
#concatenate the clusters labels to the dataframe
pca_df=pd.concat([pca_df,pd.DataFrame({'cluster':labels})], axis=1)
pca_df
#Visualizing the clusters using 3D-Scatterplot
plt.figure(figsize=(30,30))
fig=px.scatter_3d(pca_df, x='pca1',y='pca2', z='pca3',color='cluster', symbol='cluster', size_max=18, opacity=0.7)
fig.update_layout(margin=dict(l=0,r=0,b=0,t=0))
Visualizing the clusters using 2D-Scatterplot: axe1 and axe2
# 2D scatter of the first two principal components, coloured by cluster.
plt.figure(figsize=(30,30))
fig=px.scatter(pca_df, x='pca1',y='pca2',color='cluster', symbol='cluster', size_max=18, opacity=0.7)
fig.update_layout(margin=dict(l=0,r=0,b=0,t=0))
sales_df_copy.shape
# Defining our autoencoder
# NOTE(review): SGD is imported but never used, and 'keras' here should
# probably be 'tensorflow.keras' like the rest of the file — confirm.
from keras.optimizers import SGD
# Glorot Uniform initializer: https://keras.rstudio.com/reference/initializer_glorot_uniform.html
# Tie the input width to the data instead of hard-coding 37, so the model
# stays correct if the feature set changes upstream.
n_features = sales_df_copy_scaled.shape[1]  # 37 for this dataset
input_df = Input(shape=(n_features,))
x = Dense(50, activation='relu')(input_df)
x = Dense(500, activation='relu', kernel_initializer='glorot_uniform')(x)
x = Dense(500, activation='relu', kernel_initializer='glorot_uniform')(x)
x = Dense(2000, activation='relu', kernel_initializer='glorot_uniform')(x)
# Bottleneck: the 8-dimensional code used for dimensionality reduction.
encoded = Dense(8, activation='relu', kernel_initializer='glorot_uniform')(x)
# Fix: the original read "x=x=Dense(...)" — a confusing double assignment.
x = Dense(500, activation='relu', kernel_initializer='glorot_uniform')(encoded)
# NOTE(review): a relu output cannot reconstruct the negative values present
# in standardized data — a linear activation may fit better; left unchanged.
decoded = Dense(n_features, activation='relu', kernel_initializer='glorot_uniform')(x)  # decoder output
# Full autoencoder: input -> reconstruction.
autoencoder = Model(input_df, decoded)
# Encoder alone: input -> 8-d code, used for dimensionality reduction.
encoder = Model(input_df, encoded)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
# Train the autoencoder to reconstruct the scaled data from itself.
autoencoder.fit(sales_df_copy_scaled, sales_df_copy_scaled, batch_size=128, epochs=500, verbose=0)
autoencoder.save_weights('autoencoder_1.h5')
pred = encoder.predict(sales_df_copy_scaled)  # dimension-reduced data, shape (n_samples, 8)
Now it is clear that the elbow method above gives an optimal number of clusters k=3.
# Cluster the data using kmeans, k=3 (suggested by the elbow on the encoded data)
kmeans=KMeans(3)
# NOTE(review): this fits on the full scaled data, not on the autoencoder
# output `pred` that the elbow above used. That looks inconsistent, but the
# cluster-center frames below require the full feature space — confirm
# which space was intended.
kmeans.fit(sales_df_copy_scaled)
labels=kmeans.labels_
labels
kmeans.cluster_centers_.shape  # (3 clusters, n_features)
#Converting cluster centers into a dataframe
cluster_centers=pd.DataFrame(data=kmeans.cluster_centers_, columns=[sales_df.columns.drop('CITY')])
cluster_centers
# In order to understand what these numbers mean, let's perform inverse
# transformation back to the original (unscaled) feature units.
cluster_centers = scaler.inverse_transform(cluster_centers)
cluster_centers = pd.DataFrame(data = cluster_centers, columns = [sales_df.columns.drop('CITY')])
cluster_centers
#Adding the labels on my data (on corresponding samples)
sales_df_cluster=pd.concat([sales_df_copy, pd.DataFrame({'cluster':labels})], axis=1)
sales_df_cluster
#Plot the histogram for each feature based on cluster
hist_by_cluster(sales_df_cluster.columns[:8],sales_df_cluster,3)
# PCA down to 3 components for visualisation of the 3-cluster solution
pca=PCA(n_components=3)
principal_comp=pca.fit_transform(sales_df_copy_scaled)
principal_comp
pca_df=pd.DataFrame(data=principal_comp, columns=['pca1','pca2','pca3'])
pca_df.head()
#concatenate the clusters labels to the dataframe
pca_df=pd.concat([pca_df,pd.DataFrame({'cluster':labels})], axis=1)
pca_df
#Visualizing the clusters using 3D-Scatterplot
plt.figure(figsize=(30,30))
fig=px.scatter_3d(pca_df, x='pca1',y='pca2', z='pca3',color='cluster', symbol='cluster', size_max=18, opacity=0.7)
fig.update_layout(margin=dict(l=0,r=0,b=0,t=0))